import requests
from bs4 import BeautifulSoup

from spider_utils.csv_utils import img_data_to_csv
from spider_utils.img_utils import get_img_mes
from spider_utils.tag_utils import get_tags

# Request headers: spoof a desktop Chrome user-agent so the site serves
# normal pages to the scraper.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/89.0.4389.128 Safari/537.36'
}

# One shared Session so TCP connections (and any cookies) are reused
# across the many requests this script makes.
session = requests.Session()

# Root URL of the site being scraped.
baseURL = 'https://sc.chinaz.com'

# Accumulates one metadata dict per scraped image page.
img_data_list = []

# Source-site value recorded in every CSV row (same host as baseURL).
src_site = baseURL


# Collect the category ("main tag") links from the image index page.
# Each '.flh' element's href is a site-relative path to a category listing.
r = session.get(baseURL + '/tupian/', headers=headers, timeout=5)
soup = BeautifulSoup(r.text, 'html.parser')
main_tag_list = [a_tag['href'] for a_tag in soup.select('.flh')]
# print(main_tag_list)

# Walk every category, paginate through its listing pages, and collect
# metadata from each image detail page.
# (At the time of writing there were 15 main tags.)

# Running counter of image detail pages processed — progress indicator only.
i = 0

for main_tag in main_tag_list:

    url = baseURL + main_tag
    r = session.get(url, headers=headers, timeout=5)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Max page count: assumes the last <b> on the category page holds the
    # highest pagination number — TODO confirm against the live markup.
    max_page = soup.select('b')[-1].string
    # print(max_page)

    # Build the pagination URLs: page 1 is the bare category URL, page N is
    # '<url minus ".html">_N.html'.
    page_link_list = [
        url if page == 1 else url[:-5] + '_' + str(page) + '.html'
        for page in range(1, int(max_page) + 1)
    ]

    for page_link in page_link_list:
        r = session.get(page_link, headers=headers, timeout=5)
        soup = BeautifulSoup(r.text, 'html.parser')
        # Detail-page hrefs are protocol-relative ('//...'), so prefix a scheme.
        img_page_link_list = [
            'http:' + a_tag['href'] for a_tag in soup.select('.picblock div > a')
        ]
        # print(img_page_link_list)

        # Visit each image detail page and extract its metadata.
        for link in img_page_link_list:

            print(i)  # progress output
            i += 1

            try:
                img_data = {'src_site': src_site}
                r = session.get(link, headers=headers, timeout=5)
                r.encoding = 'utf-8'
                soup = BeautifulSoup(r.text, 'html.parser')
                # Title text feeds the tag extractor; tags are stored comma-joined.
                title = soup.select('.text_wrap a')[0].string
                img_data['tags'] = ','.join(get_tags(title))
                # Image src is protocol-relative as well.
                img_data['src'] = 'http:' + soup.select('.imga img')[0]['src']
                img_data['outer_net'] = 0
                other_mes = get_img_mes(img_data['src'])
                img_data['colors'] = other_mes['colors']
                img_data['size'] = other_mes['size']
                img_data_list.append(img_data)
            except Exception as e:
                # Best-effort scraping: skip pages that fail to download or
                # parse, but report why instead of the old bare `except:`,
                # which also swallowed KeyboardInterrupt/SystemExit.
                print('skip %s: %s' % (link, e))
                continue

# Persist everything that was scraped. Guard the empty case: the original
# unconditionally read img_data_list[0], which raises IndexError when no
# page was scraped successfully.
if img_data_list:
    img_data_to_csv(img_data_list, img_data_list[0].keys(), '../../data/chinaz_data.csv')